import pandas as pd
import numpy as np
import plotly
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings("ignore")
# Load the water-potability dataset: 9 physico-chemical measurements per
# sample plus the binary target `Potability` (1 = drinkable).
df = pd.read_csv("water_potability.csv")
df
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
| 1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | 0 |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3271 | 4.668102 | 193.681735 | 47580.991603 | 7.166639 | 359.948574 | 526.424171 | 13.894419 | 66.687695 | 4.435821 | 1 |
| 3272 | 7.808856 | 193.553212 | 17329.802160 | 8.061362 | NaN | 392.449580 | 19.903225 | NaN | 2.798243 | 1 |
| 3273 | 9.419510 | 175.762646 | 33155.578218 | 7.350233 | NaN | 432.044783 | 11.039070 | 69.845400 | 3.298875 | 1 |
| 3274 | 5.126763 | 230.603758 | 11983.869376 | 6.303357 | NaN | 402.883113 | 11.168946 | 77.488213 | 4.708658 | 1 |
| 3275 | 7.874671 | 195.102299 | 17404.177061 | 7.509306 | NaN | 327.459760 | 16.140368 | 78.698446 | 2.309149 | 1 |
3276 rows × 10 columns
# Summary statistics; note the reduced counts for ph/Sulfate/Trihalomethanes.
df.describe()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 2785.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 2495.000000 | 3276.000000 | 3276.000000 | 3114.000000 | 3276.000000 | 3276.000000 |
| mean | 7.080795 | 196.369496 | 22014.092526 | 7.122277 | 333.775777 | 426.205111 | 14.284970 | 66.396293 | 3.966786 | 0.390110 |
| std | 1.594320 | 32.879761 | 8768.570828 | 1.583085 | 41.416840 | 80.824064 | 3.308162 | 16.175008 | 0.780382 | 0.487849 |
| min | 0.000000 | 47.432000 | 320.942611 | 0.352000 | 129.000000 | 181.483754 | 2.200000 | 0.738000 | 1.450000 | 0.000000 |
| 25% | 6.093092 | 176.850538 | 15666.690297 | 6.127421 | 307.699498 | 365.734414 | 12.065801 | 55.844536 | 3.439711 | 0.000000 |
| 50% | 7.036752 | 196.967627 | 20927.833607 | 7.130299 | 333.073546 | 421.884968 | 14.218338 | 66.622485 | 3.955028 | 0.000000 |
| 75% | 8.062066 | 216.667456 | 27332.762127 | 8.114887 | 359.950170 | 481.792304 | 16.557652 | 77.337473 | 4.500320 | 1.000000 |
| max | 14.000000 | 323.124000 | 61227.196008 | 13.127000 | 481.030642 | 753.342620 | 28.300000 | 124.000000 | 6.739000 | 1.000000 |
# Dataset dimensions (rows, columns).
df.shape
(3276, 10)
# Column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3276 entries, 0 to 3275 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 2785 non-null float64 1 Hardness 3276 non-null float64 2 Solids 3276 non-null float64 3 Chloramines 3276 non-null float64 4 Sulfate 2495 non-null float64 5 Conductivity 3276 non-null float64 6 Organic_carbon 3276 non-null float64 7 Trihalomethanes 3114 non-null float64 8 Turbidity 3276 non-null float64 9 Potability 3276 non-null int64 dtypes: float64(9), int64(1) memory usage: 256.1 KB
# Distinct values per column; only Potability is categorical (2 levels).
df.nunique()
ph 2785 Hardness 3276 Solids 3276 Chloramines 3276 Sulfate 2495 Conductivity 3276 Organic_carbon 3276 Trihalomethanes 3114 Turbidity 3276 Potability 2 dtype: int64
# Class balance of the target.
df['Potability'].value_counts()
Potability 0 1998 1 1278 Name: count, dtype: int64
There is a class imbalance in the target variable: non-potable samples (1998) considerably outnumber potable samples (1278).
# Bar chart of the class counts.
sns.countplot(data = df, x = 'Potability')
<Axes: xlabel='Potability', ylabel='count'>
# Same class balance as a pie chart.
df.Potability.value_counts().plot(kind ='pie')
<Axes: ylabel='count'>
# Split the frame by class so each feature's density can be compared
# between potable and non-potable samples.
non_potable = df[df['Potability'] == 0]
potable = df[df['Potability'] == 1]
plt.figure(figsize=(13,13))
# One KDE panel per feature (the first 9 columns; Potability excluded).
for ax,col in enumerate(df.columns[:9]):
    plt.subplot(3,3,ax+1)
    plt.title(col)
    sns.kdeplot(x=non_potable[col],label = "Non Potable")
    sns.kdeplot(x=potable[col],label = "Potable")
    plt.legend()
plt.suptitle('Water Quality Distribution',y=1.01, size = 16, color = 'black', weight='bold')
plt.tight_layout()
# Skewness of each feature (target dropped); values near 0 suggest symmetry.
df.drop('Potability', axis=1).skew()
ph 0.025630 Hardness -0.039342 Solids 0.621634 Chloramines -0.012098 Sulfate -0.035947 Conductivity 0.264490 Organic_carbon 0.025533 Trihalomethanes -0.083031 Turbidity -0.007817 dtype: float64
Distribution of the data is mostly Gaussian.
# Pairwise Pearson correlations between all columns.
df.corr()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| ph | 1.000000 | 0.082096 | -0.089288 | -0.034350 | 0.018203 | 0.018614 | 0.043503 | 0.003354 | -0.039057 | -0.003556 |
| Hardness | 0.082096 | 1.000000 | -0.046899 | -0.030054 | -0.106923 | -0.023915 | 0.003610 | -0.013013 | -0.014449 | -0.013837 |
| Solids | -0.089288 | -0.046899 | 1.000000 | -0.070148 | -0.171804 | 0.013831 | 0.010242 | -0.009143 | 0.019546 | 0.033743 |
| Chloramines | -0.034350 | -0.030054 | -0.070148 | 1.000000 | 0.027244 | -0.020486 | -0.012653 | 0.017084 | 0.002363 | 0.023779 |
| Sulfate | 0.018203 | -0.106923 | -0.171804 | 0.027244 | 1.000000 | -0.016121 | 0.030831 | -0.030274 | -0.011187 | -0.023577 |
| Conductivity | 0.018614 | -0.023915 | 0.013831 | -0.020486 | -0.016121 | 1.000000 | 0.020966 | 0.001285 | 0.005798 | -0.008128 |
| Organic_carbon | 0.043503 | 0.003610 | 0.010242 | -0.012653 | 0.030831 | 0.020966 | 1.000000 | -0.013274 | -0.027308 | -0.030001 |
| Trihalomethanes | 0.003354 | -0.013013 | -0.009143 | 0.017084 | -0.030274 | 0.001285 | -0.013274 | 1.000000 | -0.022145 | 0.007130 |
| Turbidity | -0.039057 | -0.014449 | 0.019546 | 0.002363 | -0.011187 | 0.005798 | -0.027308 | -0.022145 | 1.000000 | 0.001581 |
| Potability | -0.003556 | -0.013837 | 0.033743 | 0.023779 | -0.023577 | -0.008128 | -0.030001 | 0.007130 | 0.001581 | 1.000000 |
# Annotated correlation heatmap with a diverging palette.
plt.figure(figsize = (12,10))
cmap = sns.diverging_palette(250, 30, l=65, center="dark", as_cmap=True)
sns.heatmap(df.corr(), annot = True, cmap=cmap)
plt.suptitle('Water Quality Heat Map', y = 0.92, size = 16, color = 'black', weight='bold')
Text(0.5, 0.92, 'Water Quality Heat Map')
# Absolute correlation of each feature with the target, strongest first.
df.corr().abs()['Potability'].sort_values(ascending = False)
Potability 1.000000 Solids 0.033743 Organic_carbon 0.030001 Chloramines 0.023779 Sulfate 0.023577 Hardness 0.013837 Conductivity 0.008128 Trihalomethanes 0.007130 ph 0.003556 Turbidity 0.001581 Name: Potability, dtype: float64
# Pairwise scatter plots colored by class, with KDEs on the diagonal.
ax = sns.pairplot(df, hue = "Potability", diag_kind = "kde", kind = "scatter")
ax.fig.set_size_inches(16,16)
ax.fig.suptitle('Water Quality Pair-Plot',y=1.01, size = 16, color = 'black', weight='bold')
Text(0.5, 1.01, 'Water Quality Pair-Plot')
Both the correlation heatmap and pairplots between the variables show that there are no clear linear relationships between the features and the target variable. We can rule out using a simple linear model.
# One box-plot per column (2x5 grid) to eyeball outliers in each feature.
fig, ax = plt.subplots(ncols=5, nrows=2, figsize=(30,10))
ax = ax.flatten()
# enumerate(df.columns) replaces the manual counter plus df.items(),
# which also yielded each column's Series only to discard it.
for index, attributeCol in enumerate(df.columns):
    sns.boxplot(y=attributeCol, data=df, ax=ax[index], color="#cf7849")
plt.tight_layout(pad=1.5, w_pad=1.5, h_pad=5.0)
plt.suptitle('Water Quality Box-Plot',y=1.01, size = 16, color = 'black', weight='bold')
Text(0.5, 1.01, 'Water Quality Box-Plot')
There are outliers within every feature of the dataset.
# Quick check: does the frame contain any missing values at all?
df.isna().any().any()
True
# Missing-value count per column.
df.isnull().sum()
ph 491 Hardness 0 Solids 0 Chloramines 0 Sulfate 781 Conductivity 0 Organic_carbon 0 Trihalomethanes 162 Turbidity 0 Potability 0 dtype: int64
# Bar chart of missing data per feature. Scaled by 100 so the plotted
# values actually are percentages, matching the axis label and title
# (the original plotted 0-1 fractions under a "percentage" label).
(df.isnull().mean()*100).plot.bar(figsize=(12,6), color="#cf7849")
plt.ylabel('Percentage of missing values')
plt.xlabel('Features')
plt.title('Missing Data in Percentages');
# Same figures as a printed Series.
df.isnull().mean()*100
ph 14.987790 Hardness 0.000000 Solids 0.000000 Chloramines 0.000000 Sulfate 23.840049 Conductivity 0.000000 Organic_carbon 0.000000 Trihalomethanes 4.945055 Turbidity 0.000000 Potability 0.000000 dtype: float64
# Per-class means of the three incomplete features (non-potable rows).
df[df['Potability']==0][['ph','Sulfate','Trihalomethanes']].mean()
ph 7.085378 Sulfate 334.564290 Trihalomethanes 66.303555 dtype: float64
# Per-class means of the three incomplete features (potable rows).
df[df['Potability']==1][['ph','Sulfate','Trihalomethanes']].mean()
ph 7.073783 Sulfate 332.566990 Trihalomethanes 66.539684 dtype: float64
The three features with missing values all have Gaussian distribution therefore it is a safe assumption to use the mean of each feature to fill in for the missing values.
# Mean-impute the three roughly Gaussian features on a copy of the data.
df1 = df.copy()
# Assignment form instead of `inplace=True` on a column slice: chained
# in-place fills are deprecated (FutureWarning since pandas 2.1, slated
# for removal under copy-on-write) and may silently not update the frame.
df1['ph'] = df1['ph'].fillna(df1['ph'].mean())
df1['Sulfate'] = df1['Sulfate'].fillna(df1['Sulfate'].mean())
df1['Trihalomethanes'] = df1['Trihalomethanes'].fillna(df1['Trihalomethanes'].mean())
# Verify no missing values remain.
df1.isnull().sum()
ph 0 Hardness 0 Solids 0 Chloramines 0 Sulfate 0 Conductivity 0 Organic_carbon 0 Trihalomethanes 0 Turbidity 0 Potability 0 dtype: int64
Since the features have Gaussian distribution we can trim out the outliers data outside of 3 standard deviations.
from scipy import stats
# Outlier trimming: keep only rows whose z-score is within 3 SD in every
# column (drops 148 of 3276 rows per the shape output below).
df_water = df1.copy()
# NOTE(review): the z-score filter runs over all 10 columns including the
# binary Potability target; its z-scores stay well below 3 so no rows are
# dropped because of it, but excluding the target would be cleaner — confirm.
df_water = df_water[(np.abs(stats.zscore(df1)) <= 3).all(axis=1)]
df_water.shape
(3128, 10)
# Re-draw the per-column box-plots after outlier trimming for comparison.
fig, ax = plt.subplots(ncols=5, nrows=2, figsize=(30,10))
ax = ax.flatten()
# enumerate(df_water.columns) replaces the manual counter plus
# df_water.items(), which also yielded the unused column Series.
for index, attributeCol in enumerate(df_water.columns):
    sns.boxplot(y=attributeCol, data=df_water, ax=ax[index], color='#597eb3')
plt.tight_layout(pad=1.5, w_pad=1.5, h_pad=5.0)
plt.suptitle('Water Quality Box-Plot (after outlier trimming)',y=1.01, size = 16, color = 'black', weight='bold')
Text(0.5, 1.01, 'Water Quality Box-Plot (after outlier trimming)')
# Balance the classes by randomly over-sampling the minority class
# (Potability == 1) up to the majority count.
# NOTE(review): over-sampling happens BEFORE the train/test split below, so
# duplicated minority rows can land in both train and test; the reported
# test accuracies are therefore optimistic. Over-sample the training split
# only to get an honest estimate.
count_class_0, count_class_1 = df_water.Potability.value_counts()
df_class_0 = df_water[df_water['Potability'] == 0]
df_class_1 = df_water[df_water['Potability'] == 1]
# Sample WITH replacement so the minority class can exceed its own size.
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)
print('Random over-sampling:')
print(df_test_over.Potability.value_counts())
df_test_over.Potability.value_counts().plot(kind='pie', title='Count (target)');
Random over-sampling: Potability 0 1930 1 1930 Name: count, dtype: int64
from sklearn.model_selection import train_test_split
# Feature matrix / target vector from the over-sampled frame.
X = df_test_over.drop('Potability',axis=1)
y = df_test_over['Potability']
# 80/20 split with a fixed seed for reproducibility.
# NOTE(review): because over-sampling preceded this split, duplicates of the
# same minority row may appear on both sides — see note at the sampling cell.
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
# Scaler is fitted on the training split only and reused on test — correct,
# no scaling leakage.
X_train=scale.fit_transform(X_train)
X_test=scale.transform(X_test)
print("X_train:", X_train.shape, "\ny_train:", y_train.shape)
print("X_test:", X_test.shape, "\ny_test:", y_test.shape)
X_train: (3088, 9) y_train: (3088,) X_test: (772, 9) y_test: (772,)
from sklearn.decomposition import PCA
# Full PCA (all components) on the standardized training data to inspect
# how the explained variance accumulates.
pca = PCA()
X_train_pca = pca.fit_transform(X_train)
exp_var_pca = pca.explained_variance_ratio_
# Running total of explained variance across components.
cum_sum_eigenvalues = np.cumsum(exp_var_pca)
plt.bar(range(0,len(exp_var_pca)), exp_var_pca, alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(0,len(cum_sum_eigenvalues)), cum_sum_eigenvalues, where='mid', label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.title("Water Quality PCA")
plt.legend(loc='best')
plt.show()
PCA tells us that the features are largely independent, confirming our previous observation from the correlation matrix and heatmap that there are no linear relationships between the features. The explained-variance ratios show that we would still need 8 components to explain 80% of the variance; therefore, dimensionality reduction will not be beneficial for this dataset.
from sklearn.metrics import precision_score, accuracy_score, mean_absolute_error, mean_squared_error, confusion_matrix, classification_report, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# Baseline 1: logistic regression with default hyper-parameters.
LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)
LogReg_pred = LogReg.predict(X_test)
LogReg_acc = accuracy_score(y_test, LogReg_pred)
print("The accuracy for Logistic Regression is", LogReg_acc)
print("The classification report using Logistic Regression is:")
print(classification_report(y_test, LogReg_pred))
The accuracy for Logistic Regression is 0.45466321243523317
The classification report using Logistic Regression is:
precision recall f1-score support
0 0.47 0.40 0.43 400
1 0.44 0.51 0.48 372
accuracy 0.45 772
macro avg 0.46 0.46 0.45 772
weighted avg 0.46 0.45 0.45 772
# Confusion matrix shown as percentages of all test samples.
LogReg_cm = confusion_matrix(y_test, LogReg_pred)
sns.heatmap(LogReg_cm/np.sum(LogReg_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
# Fixed title typo: "Logisitic" -> "Logistic".
plt.title("Logistic Regression Confusion Matrix")
Text(0.5, 1.0, 'Logisitic Regression Confusion Matrix')
# Baseline 2: k-nearest neighbours with default hyper-parameters.
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)
KNN_pred = KNN.predict(X_test)
KNN_acc = accuracy_score(y_test, KNN_pred)
print("The accuracy for KNeighbors is", KNN_acc)
# Fixed: the accuracy value was mistakenly appended to this header line.
print("The classification report using KNeighbors is:")
print(classification_report(y_test, KNN_pred))
The accuracy for KNeighbors is 0.655440414507772
The classification report using KNeighbors is: 0.655440414507772
precision recall f1-score support
0 0.69 0.60 0.64 400
1 0.62 0.72 0.67 372
accuracy 0.66 772
macro avg 0.66 0.66 0.66 772
weighted avg 0.66 0.66 0.65 772
# Confusion matrix shown as percentages of all test samples.
KNN_cm = confusion_matrix(y_test, KNN_pred)
sns.heatmap(KNN_cm/np.sum(KNN_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
plt.title("KNN Regression Confusion Matrix")
Text(0.5, 1.0, 'KNN Regression Confusion Matrix')
# Baseline 3: decision tree with default hyper-parameters.
DecTree = DecisionTreeClassifier()
DecTree.fit(X_train, y_train)
DecTree_pred = DecTree.predict(X_test)
DecTree_acc = accuracy_score(y_test, DecTree_pred)
print("The accuracy for Decision Tree is", DecTree_acc)
print("The classification report using Decision Tree is:")
print(classification_report(y_test, DecTree_pred))
The accuracy for Decision Tree is 0.7797927461139896
The classification report using Decision Tree is:
precision recall f1-score support
0 0.84 0.71 0.77 400
1 0.73 0.85 0.79 372
accuracy 0.78 772
macro avg 0.79 0.78 0.78 772
weighted avg 0.79 0.78 0.78 772
# Confusion matrix shown as percentages of all test samples.
DecTree_cm = confusion_matrix(y_test, DecTree_pred)
sns.heatmap(DecTree_cm/np.sum(DecTree_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
plt.title("Decision Tree Confusion Matrix")
Text(0.5, 1.0, 'Decision Tree Confusion Matrix')
# Baseline 4: random forest with default hyper-parameters.
RFTree = RandomForestClassifier()
RFTree.fit(X_train, y_train)
RFTree_pred = RFTree.predict(X_test)
RFTree_acc = accuracy_score(y_test, RFTree_pred)
print("The accuracy for Random Forest is", RFTree_acc)
print("The classification report using Random Forest is:")
print(classification_report(y_test, RFTree_pred))
The accuracy for Random Forest is 0.8238341968911918
The classification report using Random Forest is:
precision recall f1-score support
0 0.83 0.83 0.83 400
1 0.82 0.81 0.82 372
accuracy 0.82 772
macro avg 0.82 0.82 0.82 772
weighted avg 0.82 0.82 0.82 772
# Confusion matrix shown as percentages of all test samples.
RFTree_cm = confusion_matrix(y_test, RFTree_pred)
sns.heatmap(RFTree_cm/np.sum(RFTree_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
plt.title("Random Forest Confusion Matrix")
Text(0.5, 1.0, 'Random Forest Confusion Matrix')
# Baseline 5: support-vector classifier with default hyper-parameters.
SVM = SVC()
SVM.fit(X_train, y_train)
SVM_pred = SVM.predict(X_test)
SVM_acc = accuracy_score(y_test, SVM_pred)
print("The accuracy for SVM is", SVM_acc)
# Fixed: the accuracy value was mistakenly appended to this header line.
print("The classification report using SVM is:")
print(classification_report(y_test, SVM_pred))
The accuracy for SVM is 0.6722797927461139
The classification report using SVM is: 0.6722797927461139
precision recall f1-score support
0 0.67 0.71 0.69 400
1 0.67 0.63 0.65 372
accuracy 0.67 772
macro avg 0.67 0.67 0.67 772
weighted avg 0.67 0.67 0.67 772
# Confusion matrix shown as percentages of all test samples.
SVM_cm = confusion_matrix(y_test, SVM_pred)
sns.heatmap(SVM_cm/np.sum(SVM_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
plt.title("SVM Confusion Matrix")
Text(0.5, 1.0, 'SVM Confusion Matrix')
# Baseline 6: gradient-boosted trees (XGBoost) with default hyper-parameters.
XGB = XGBClassifier()
XGB.fit(X_train, y_train)
XGB_pred = XGB.predict(X_test)
XGB_acc = accuracy_score(y_test, XGB_pred)
print("The accuracy for XGBoost is", XGB_acc)
# Fixed: the accuracy value was mistakenly appended to this header line.
print("The classification report using XGBoost is:")
print(classification_report(y_test, XGB_pred))
The accuracy for XGBoost is 0.7940414507772021
The classification report using XGBoost is: 0.7940414507772021
precision recall f1-score support
0 0.83 0.76 0.79 400
1 0.76 0.84 0.80 372
accuracy 0.79 772
macro avg 0.80 0.80 0.79 772
weighted avg 0.80 0.79 0.79 772
# Confusion matrix shown as percentages of all test samples.
XGB_cm = confusion_matrix(y_test, XGB_pred)
sns.heatmap(XGB_cm/np.sum(XGB_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
plt.title("XGBoost Confusion Matrix")
Text(0.5, 1.0, 'XGBoost Confusion Matrix')
# Collect first-iteration accuracies into one frame, best model first.
models = pd.DataFrame({
    'Model':['Logistic Regression', 'KNN Regression', 'Decision Tree', 'Random Forest', 'Support Vector', 'XGBoost'],
    'Accuracy' :[LogReg_acc, KNN_acc, DecTree_acc, RFTree_acc, SVM_acc, XGB_acc]
})
models = models.sort_values(by='Accuracy', ascending=False)
models
| Model | Accuracy | |
|---|---|---|
| 3 | Random Forest | 0.823834 |
| 5 | XGBoost | 0.794041 |
| 2 | Decision Tree | 0.779793 |
| 4 | Support Vector | 0.672280 |
| 1 | KNN Regression | 0.655440 |
| 0 | Logistic Regression | 0.454663 |
# Horizontal bar chart of first-iteration accuracies, with a dotted
# reference line at 0.8 accuracy.
fig = plt.figure(figsize=(15,9))
ax = sns.barplot(data=models,
                 y='Model',
                 x='Accuracy',
                 palette = 'Oranges_r')
ax.figure.suptitle('Performance of Models -- First Iteration',y=0.91, size = 16, color = 'black', weight='bold');
plt.xlabel('Accuracy')
plt.ylabel('Models')
plt.axvline(x = 0.8, ymin = 0, ymax = 1,
            linewidth = 2, linestyle =":",
            color ='#cf7849');
After the first iteration of model training we can see that Logistic Regression performed the lowest (45%), while KNN (66%) and SVM (67%) had close accuracy values. The Decision Tree classifier reached 78% and XGBoost 79%, while Random Forest was the most accurate at 82%.
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Fresh, unfitted estimators for the hyper-parameter searches below.
lgr = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
svc = SVC()
# `use_label_encoder` was deprecated in xgboost 1.6 and removed in 2.0,
# where passing it raises an error — dropped for forward compatibility.
xgb = XGBClassifier(eval_metric = 'logloss')
# Grid search for logistic regression over solver x penalty.
# NOTE(review): several combinations are invalid (e.g. newton-cg + l1,
# elasticnet without saga); GridSearchCV records those fits as failures.
para_lgr = {'solver':['newton-cg', 'lbfgs', 'sag', 'saga', 'liblinear'], 'penalty': ['l1', 'l2', 'elasticnet', 'none']}
grid_lgr = GridSearchCV(lgr, param_grid=para_lgr, cv=5)
grid_lgr.fit(X_train, y_train)
print("Best parameters for Logistic Regression:", grid_lgr.best_params_)
Best parameters for Logistic Regression: {'penalty': 'l1', 'solver': 'saga'}
# Grid search for KNN over k (1-49), weighting and search algorithm (5-fold CV).
para_knn = {'n_neighbors':np.arange(1, 50), 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
grid_knn = GridSearchCV(knn, param_grid=para_knn, cv=5)
grid_knn.fit(X_train, y_train)
print("Best parameters for KNN:", grid_knn.best_params_)
Best parameters for KNN: {'algorithm': 'auto', 'n_neighbors': 42, 'weights': 'distance'}
# Grid search for the decision tree over split criterion, depth and leaf size.
para_dt = {'criterion':['gini','entropy'],'max_depth':np.arange(1, 50), 'min_samples_leaf':[1,2,4,5,10,20,30,40,80,100]}
grid_dt = GridSearchCV(dt, param_grid=para_dt, cv=5)
grid_dt.fit(X_train, y_train)
print("Best parameters for Decision Tree:", grid_dt.best_params_)
Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 41, 'min_samples_leaf': 1}
# Grid search for the random forest over ensemble size and leaf size.
params_rf = {'n_estimators':[100,200, 350, 500], 'min_samples_leaf':[2, 10, 30]}
grid_rf = GridSearchCV(rf, param_grid=params_rf, cv=5)
grid_rf.fit(X_train, y_train)
print("Best parameters for Random Forest:", grid_rf.best_params_)
Best parameters for Random Forest: {'min_samples_leaf': 2, 'n_estimators': 350}
# Grid search for the RBF-kernel SVM over regularization C and kernel width gamma.
para_svc = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf']}
grid_svc = GridSearchCV(svc, param_grid=para_svc, cv=5)
grid_svc.fit(X_train, y_train)
print("Best parameters for SVM:", grid_svc.best_params_)
Best parameters for SVM: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
# Randomized search for XGBoost (samples 10 of the 28 combinations by default).
params_xgb = {'n_estimators': [50,100,250,400,600,800,1000], 'learning_rate': [0.2,0.5,0.8,1]}
rs_xgb = RandomizedSearchCV(xgb, param_distributions=params_xgb, cv=5)
rs_xgb.fit(X_train, y_train)
print("Best parameters for XGBoost:", rs_xgb.best_params_)
Best parameters for XGBoost: {'n_estimators': 600, 'learning_rate': 0.5}
# Second iteration: logistic regression refit with the grid-search winners.
LogReg2 = LogisticRegression(penalty = 'l1', solver= 'saga')
LogReg2.fit(X_train, y_train)
LogReg2_pred = LogReg2.predict(X_test)
LogReg2_acc = accuracy_score(y_test, LogReg2_pred)
print("The accuracy for Logistic Regression is", LogReg2_acc)
print("The classification report using Logistic Regression is:")
print(classification_report(y_test, LogReg2_pred))
The accuracy for Logistic Regression is 0.45466321243523317
The classification report using Logistic Regression is:
precision recall f1-score support
0 0.47 0.40 0.43 400
1 0.44 0.51 0.47 372
accuracy 0.45 772
macro avg 0.46 0.46 0.45 772
weighted avg 0.46 0.45 0.45 772
# Confusion matrix shown as percentages of all test samples.
LogReg2_cm = confusion_matrix(y_test, LogReg2_pred)
sns.heatmap(LogReg2_cm/np.sum(LogReg2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
# Fixed title typo: "Logisitic" -> "Logistic".
plt.title("Logistic Regression Confusion Matrix")
Text(0.5, 1.0, 'Logisitic Regression Confusion Matrix')
# Second iteration: KNN refit with the grid-search winners.
KNN2 = KNeighborsClassifier(algorithm = 'auto', n_neighbors = 42, weights ='distance')
KNN2.fit(X_train, y_train)
KNN2_pred = KNN2.predict(X_test)
KNN2_acc = accuracy_score(y_test, KNN2_pred)
print("The accuracy for KNeighbors is", KNN2_acc)
# Fixed: the accuracy value was mistakenly appended to this header line.
print("The classification report using KNeighbors is:")
print(classification_report(y_test, KNN2_pred))
The accuracy for KNeighbors is 0.772020725388601
The classification report using KNeighbors is: 0.772020725388601
precision recall f1-score support
0 0.82 0.72 0.77 400
1 0.73 0.83 0.78 372
accuracy 0.77 772
macro avg 0.78 0.77 0.77 772
weighted avg 0.78 0.77 0.77 772
# Confusion matrix shown as percentages of all test samples.
KNN2_cm = confusion_matrix(y_test, KNN2_pred)
sns.heatmap(KNN2_cm/np.sum(KNN2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
plt.title("KNN Regression Confusion Matrix")
Text(0.5, 1.0, 'KNN Regression Confusion Matrix')
# Second iteration: decision tree refit with the grid-search winners.
DecTree2 = DecisionTreeClassifier(criterion = 'entropy', max_depth = 41, min_samples_leaf = 1)
DecTree2.fit(X_train, y_train)
DecTree2_pred = DecTree2.predict(X_test)
DecTree2_acc = accuracy_score(y_test, DecTree2_pred)
print("The accuracy for Decision Tree is", DecTree2_acc)
print("The classification report using Decision Tree is:")
print(classification_report(y_test, DecTree2_pred))
The accuracy for Decision Tree is 0.7797927461139896
The classification report using Decision Tree is:
precision recall f1-score support
0 0.84 0.71 0.77 400
1 0.73 0.85 0.79 372
accuracy 0.78 772
macro avg 0.79 0.78 0.78 772
weighted avg 0.79 0.78 0.78 772
# Confusion matrix shown as percentages of all test samples.
DecTree2_cm = confusion_matrix(y_test, DecTree2_pred)
sns.heatmap(DecTree2_cm/np.sum(DecTree2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
plt.title("Decision Tree Confusion Matrix")
Text(0.5, 1.0, 'Decision Tree Confusion Matrix')
# Second iteration: random forest refit with the grid-search winners.
RFTree2 = RandomForestClassifier(min_samples_leaf = 2, n_estimators = 350)
RFTree2.fit(X_train, y_train)
RFTree2_pred = RFTree2.predict(X_test)
RFTree2_acc = accuracy_score(y_test, RFTree2_pred)
print("The accuracy for Random Forest is", RFTree2_acc)
print("The classification report using Random Forest is:")
print(classification_report(y_test, RFTree2_pred))
The accuracy for Random Forest is 0.8354922279792746
The classification report using Random Forest is:
precision recall f1-score support
0 0.84 0.84 0.84 400
1 0.83 0.83 0.83 372
accuracy 0.84 772
macro avg 0.84 0.84 0.84 772
weighted avg 0.84 0.84 0.84 772
# Confusion matrix shown as percentages of all test samples.
RFTree2_cm = confusion_matrix(y_test, RFTree2_pred)
sns.heatmap(RFTree2_cm/np.sum(RFTree2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
plt.title("Random Forest Confusion Matrix")
Text(0.5, 1.0, 'Random Forest Confusion Matrix')
# Second iteration: SVM refit with the grid-search winners.
SVM2 = SVC(C = 1, gamma = 1, kernel = 'rbf')
SVM2.fit(X_train, y_train)
SVM2_pred = SVM2.predict(X_test)
SVM2_acc = accuracy_score(y_test, SVM2_pred)
print("The accuracy for SVM is", SVM2_acc)
# Fixed: the accuracy value was mistakenly appended to this header line.
print("The classification report using SVM is:")
print(classification_report(y_test, SVM2_pred))
The accuracy for SVM is 0.8290155440414507
The classification report using SVM is: 0.8290155440414507
precision recall f1-score support
0 0.81 0.88 0.84 400
1 0.86 0.77 0.81 372
accuracy 0.83 772
macro avg 0.83 0.83 0.83 772
weighted avg 0.83 0.83 0.83 772
# Confusion matrix shown as percentages of all test samples.
SVM2_cm = confusion_matrix(y_test, SVM2_pred)
sns.heatmap(SVM2_cm/np.sum(SVM2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
plt.title("SVM Confusion Matrix")
Text(0.5, 1.0, 'SVM Confusion Matrix')
# Second iteration: XGBoost refit with the randomized-search winners.
XGB2 = XGBClassifier(n_estimators = 600, learning_rate = 0.5)
XGB2.fit(X_train, y_train)
XGB2_pred = XGB2.predict(X_test)
XGB2_acc = accuracy_score(y_test, XGB2_pred)
print("The accuracy for XGBoost is", XGB2_acc)
# Fixed: the accuracy value was mistakenly appended to this header line.
print("The classification report using XGBoost is:")
print(classification_report(y_test, XGB2_pred))
The accuracy for XGBoost is 0.8018134715025906
The classification report using XGBoost is: 0.8018134715025906
precision recall f1-score support
0 0.83 0.77 0.80 400
1 0.77 0.83 0.80 372
accuracy 0.80 772
macro avg 0.80 0.80 0.80 772
weighted avg 0.80 0.80 0.80 772
# Confusion matrix shown as percentages of all test samples.
XGB2_cm = confusion_matrix(y_test, XGB2_pred)
sns.heatmap(XGB2_cm/np.sum(XGB2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
plt.title("XGBoost Confusion Matrix")
Text(0.5, 1.0, 'XGBoost Confusion Matrix')
# Collect second-iteration (tuned) accuracies into one frame, best first.
models2 = pd.DataFrame({
    'Model':['Logistic Regression', 'KNN Regression', 'Decision Tree', 'Random Forest', 'Support Vector', 'XGBoost'],
    'Accuracy' :[LogReg2_acc, KNN2_acc, DecTree2_acc, RFTree2_acc, SVM2_acc, XGB2_acc],
})
models2 = models2.sort_values(by='Accuracy', ascending=False)
models2
| Model | Accuracy | |
|---|---|---|
| 3 | Random Forest | 0.835492 |
| 4 | Support Vector | 0.829016 |
| 5 | XGBoost | 0.801813 |
| 2 | Decision Tree | 0.779793 |
| 1 | KNN Regression | 0.772021 |
| 0 | Logistic Regression | 0.454663 |
# Horizontal bar chart of tuned-model accuracies, with a dotted
# reference line at 0.8 accuracy.
fig = plt.figure(figsize=(15,9))
ax = sns.barplot(data=models2,
                 y='Model',
                 x='Accuracy',
                 palette = 'Blues_r')
ax.figure.suptitle('Performance of Models -- Second Iteration',y=0.91, size = 16, color = 'black', weight='bold');
plt.xlabel('Accuracy')
plt.ylabel('Models')
plt.axvline(x = 0.8, ymin = 0, ymax = 1,
            linewidth = 2, linestyle =":",
            color ='#cf7849');
# Side-by-side accuracies of both iterations per model.
comp_iterations = pd.DataFrame({
    'Model':['Logistic Regression', 'KNN Regression', 'Decision Tree', 'Random Forest', 'Support Vector', 'XGBoost'],
    '2nd Iteration' :[LogReg2_acc, KNN2_acc, DecTree2_acc, RFTree2_acc, SVM2_acc, XGB2_acc],
    '1st Iteration' :[LogReg_acc, KNN_acc, DecTree_acc, RFTree_acc, SVM_acc, XGB_acc],
})
comp_iterations
| Model | 2nd Iteration | 1st Iteration | |
|---|---|---|---|
| 0 | Logistic Regression | 0.454663 | 0.454663 |
| 1 | KNN Regression | 0.772021 | 0.655440 |
| 2 | Decision Tree | 0.779793 | 0.779793 |
| 3 | Random Forest | 0.835492 | 0.823834 |
| 4 | Support Vector | 0.829016 | 0.672280 |
| 5 | XGBoost | 0.801813 | 0.794041 |
# Grouped bar chart comparing iteration 1 vs iteration 2 per model.
comp_iterations.plot.bar(figsize=(12,6))
plt.xticks([0,1,2,3,4,5],['Logistic Regression', 'KNN Regression', 'Decision Tree', 'Random Forest', 'Support Vector', 'XGBoost'])
plt.ylabel('Percentage of Accuracy')
plt.xlabel('Model')
plt.title('Comparison between Iterations');
After our second iteration of model training (with hyperparameter tuning) we can see that the Random Forest classifier algorithm performed the best, with the highest accuracy.
# Sanity check: run one known row from the dataset through the pipeline
# and confirm the model predicts its label.
# This row is a "not potable" example (Potability = 0).
input_data = (8.316766 ,214.373394 ,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771)
# Convert the tuple to a numpy array.
input_data_as_numpy_array = np.asarray(input_data)
# Reshape to (1, 9) since we predict a single instance.
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
# Standardize with the scaler fitted on the training split.
std_data = scale.transform(input_data_reshaped)
print(std_data)
prediction = RFTree2.predict(std_data)
print(prediction)
if (prediction[0] == 0):
    print('The Water is not Potable')
else:
    print('The Water is Potable')
[[ 0.88132315 0.57348019 0.01412136 0.63002583 0.66554207 -0.77528312 1.25509008 2.20125291 0.8618897 ]] [0] The Water is not Potable
# Sanity check: run one known row from the dataset through the pipeline
# and confirm the model predicts its label.
# This row is a "potable" example (Potability = 1).
input_data = (6.069616,186.659040,26138.780191,7.747547,345.700257,415.886955,12.067620,60.419921,3.669712)
# Convert the tuple to a numpy array.
input_data_as_numpy_array = np.asarray(input_data)
# Reshape to (1, 9) since we predict a single instance.
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
# Standardize with the scaler fitted on the training split.
std_data = scale.transform(input_data_reshaped)
print(std_data)
prediction = RFTree2.predict(std_data)
print(prediction)
# Fixed wording: "Portable" -> "Potable" (drinkable), matching the
# messages used for the non-potable sample above.
if (prediction[0] == 0):
    print('The Water is not Potable')
else:
    print('The Water is Potable')
[[-0.76708686 -0.28166272 0.49714254 0.42470761 0.33940143 -0.1192583 -0.67385627 -0.37953875 -0.38546659]] [1] The Water is Portable
# Persist a trained scikit-learn model with joblib.
# NOTE(review): SVM2 is saved here although RFTree2 scored highest
# (0.8355 vs 0.8290) — confirm which model is meant to ship.
import joblib
joblib.dump ( SVM2 , "Water_Potability_model.pkl")
['Water_Potability_model.pkl']
import pickle

# Persist the SVM model with pickle as well, then reload and re-score it
# as a round-trip check.
filename = 'Water_Potability_model.sav'
# Context managers close the file handles deterministically; the original
# left both the write and read handles open.
with open(filename, 'wb') as model_file:
    pickle.dump(SVM2, model_file)
# some time later...
# load the model from disk
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
# `score` returns a 0-1 fraction; format it as a percentage and fix the
# "Acuuracy" typo in the original message.
print(f"{result:.2%} accuracy")
0.8290155440414507 % Acuuracy